In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, \
OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score

import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv('Thyroid.csv')
df
Out[2]:
age sex on_thyroxine query_on_thyroxine on_antithyroid_meds sick pregnant thyroid_surgery I131_treatment query_hypothyroid ... TT4 T4U_measured T4U FTI_measured FTI TBG_measured TBG referral_source target patient_id
0 29 F f f f f f f f t ... NaN f NaN f NaN f NaN other - 840801013
1 29 F f f f f f f f f ... 128.0 f NaN f NaN f NaN other - 840801014
2 41 F f f f f f f f f ... NaN f NaN f NaN t 11.0 other - 840801042
3 36 F f f f f f f f f ... NaN f NaN f NaN t 26.0 other - 840803046
4 32 F f f f f f f f f ... NaN f NaN f NaN t 36.0 other S 840803047
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9167 56 M f f f f f f f f ... 64.0 t 0.83 t 77.0 f NaN SVI - 870119022
9168 22 M f f f f f f f f ... 91.0 t 0.92 t 99.0 f NaN SVI - 870119023
9169 69 M f f f f f f f f ... 113.0 t 1.27 t 89.0 f NaN SVI I 870119025
9170 47 F f f f f f f f f ... 75.0 t 0.85 t 88.0 f NaN other - 870119027
9171 31 M f f f f f f f t ... 66.0 t 1.02 t 65.0 f NaN other - 870119035

9172 rows × 31 columns

In [3]:
df.dtypes
Out[3]:
age                      int64
sex                     object
on_thyroxine            object
query_on_thyroxine      object
on_antithyroid_meds     object
sick                    object
pregnant                object
thyroid_surgery         object
I131_treatment          object
query_hypothyroid       object
query_hyperthyroid      object
lithium                 object
goitre                  object
tumor                   object
hypopituitary           object
psych                   object
TSH_measured            object
TSH                    float64
T3_measured             object
T3                     float64
TT4_measured            object
TT4                    float64
T4U_measured            object
T4U                    float64
FTI_measured            object
FTI                    float64
TBG_measured            object
TBG                    float64
referral_source         object
target                  object
patient_id               int64
dtype: object
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9172 entries, 0 to 9171
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  9172 non-null   int64  
 1   sex                  8865 non-null   object 
 2   on_thyroxine         9172 non-null   object 
 3   query_on_thyroxine   9172 non-null   object 
 4   on_antithyroid_meds  9172 non-null   object 
 5   sick                 9172 non-null   object 
 6   pregnant             9172 non-null   object 
 7   thyroid_surgery      9172 non-null   object 
 8   I131_treatment       9172 non-null   object 
 9   query_hypothyroid    9172 non-null   object 
 10  query_hyperthyroid   9172 non-null   object 
 11  lithium              9172 non-null   object 
 12  goitre               9172 non-null   object 
 13  tumor                9172 non-null   object 
 14  hypopituitary        9172 non-null   object 
 15  psych                9172 non-null   object 
 16  TSH_measured         9172 non-null   object 
 17  TSH                  8330 non-null   float64
 18  T3_measured          9172 non-null   object 
 19  T3                   6568 non-null   float64
 20  TT4_measured         9172 non-null   object 
 21  TT4                  8730 non-null   float64
 22  T4U_measured         9172 non-null   object 
 23  T4U                  8363 non-null   float64
 24  FTI_measured         9172 non-null   object 
 25  FTI                  8370 non-null   float64
 26  TBG_measured         9172 non-null   object 
 27  TBG                  349 non-null    float64
 28  referral_source      9172 non-null   object 
 29  target               9172 non-null   object 
 30  patient_id           9172 non-null   int64  
dtypes: float64(6), int64(2), object(23)
memory usage: 2.2+ MB
In [5]:
df.drop("patient_id", axis=1, inplace=True)
In [6]:
columns_to_keep = ['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'TBG', 'target']

# Drop columns except the specified ones
df = df[columns_to_keep]
df
Out[6]:
age sex TSH T3 TT4 T4U FTI TBG target
0 29 F 0.3 NaN NaN NaN NaN NaN -
1 29 F 1.6 1.9 128.0 NaN NaN NaN -
2 41 F NaN NaN NaN NaN NaN 11.0 -
3 36 F NaN NaN NaN NaN NaN 26.0 -
4 32 F NaN NaN NaN NaN NaN 36.0 S
... ... ... ... ... ... ... ... ... ...
9167 56 M NaN NaN 64.0 0.83 77.0 NaN -
9168 22 M NaN NaN 91.0 0.92 99.0 NaN -
9169 69 M NaN NaN 113.0 1.27 89.0 NaN I
9170 47 F NaN NaN 75.0 0.85 88.0 NaN -
9171 31 M NaN NaN 66.0 1.02 65.0 NaN -

9172 rows × 9 columns

In [7]:
import pandas as pd

# Assuming df is your DataFrame containing the data
# Drop the 'TBG' column
df.drop('TBG', axis=1, inplace=True)

# Remove entries with 'target' column equal to "-"
df = df[df['target'] != "-"]

# Resetting index after dropping rows
df.reset_index(drop=True, inplace=True)

# Display the modified DataFrame
df
Out[7]:
age sex TSH T3 TT4 T4U FTI target
0 32 F NaN NaN NaN NaN NaN S
1 63 F 68.00 NaN 48.0 1.02 47.0 F
2 75 F 0.05 1.6 157.0 0.89 176.0 AK
3 41 M 0.05 1.6 39.0 1.00 39.0 R
4 71 F 0.05 NaN 126.0 1.38 91.0 I
... ... ... ... ... ... ... ... ...
2396 64 M 0.81 NaN 31.0 0.55 56.0 K
2397 60 M 0.18 NaN 28.0 0.87 32.0 K
2398 64 M NaN NaN 44.0 0.53 83.0 J
2399 36 F NaN NaN 84.0 1.26 67.0 I
2400 69 M NaN NaN 113.0 1.27 89.0 I

2401 rows × 8 columns

In [8]:
import pandas as pd

# Display count of rows before removing null values
print("Number of rows before removing null values:", df.shape[0])

# Remove null values
df = df.dropna()

# Display count of rows after removing null values
print("Number of rows after removing null values:", df.shape[0])
Number of rows before removing null values: 2401
Number of rows after removing null values: 1621
In [9]:
print(df.isnull().sum())
age       0
sex       0
TSH       0
T3        0
TT4       0
T4U       0
FTI       0
target    0
dtype: int64
In [10]:
# List of valid target subdiseases
valid_subdiseases = ['A', 'B', 'F', 'G']

# Display row count before dropping
print("Row count before dropping:", len(df))

# Drop rows where 'target' column does not contain valid subdiseases
df = df[df['target'].isin(valid_subdiseases)]

# Display row count after dropping
print("Row count after dropping:", len(df))
Row count before dropping: 1621
Row count after dropping: 484
In [11]:
# Display count of each target class
target_counts = df['target'].value_counts()
print("Count of each target class:")
print(target_counts)
Count of each target class:
target
G    211
F    164
A     93
B     16
Name: count, dtype: int64
In [12]:
df.duplicated().sum()
Out[12]:
0
In [13]:
target_counts = df['target'].value_counts()
fig = px.bar(x=target_counts.index, y=target_counts.values)
fig.update_layout(xaxis_title='Target', yaxis_title='Count', title='Distribution of Target')
fig.show()
In [14]:
df['sex'].replace({'M': 0, 'F': 1}, inplace=True)
In [15]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
In [16]:
df
Out[16]:
age sex TSH T3 TT4 T4U FTI target
9 61 0 9.799999 1.2 114.0 0.84 136.0 G
10 27 1 90.000000 0.4 7.5 0.94 7.5 F
32 40 1 70.000000 0.4 3.9 0.83 5.0 F
40 57 0 0.250000 4.2 236.0 0.70 337.0 A
44 44 1 8.400000 1.8 108.0 1.01 107.0 G
... ... ... ... ... ... ... ... ...
2360 71 1 23.000000 1.8 87.0 0.96 91.0 G
2369 51 1 106.000000 0.6 5.0 0.89 5.5 F
2372 66 1 85.000000 1.8 118.0 1.27 93.0 G
2391 75 1 17.000000 1.4 104.0 1.15 90.0 G
2392 74 1 53.000000 1.0 49.0 1.25 39.0 F

484 rows × 8 columns

In [17]:
print(df.isnull().sum())
age       0
sex       0
TSH       0
T3        0
TT4       0
T4U       0
FTI       0
target    0
dtype: int64
In [18]:
df.dtypes
Out[18]:
age         int64
sex         int64
TSH       float64
T3        float64
TT4       float64
T4U       float64
FTI       float64
target     object
dtype: object
In [19]:
# Convert the 'target' column to numerical values
df['target'] = df['target'].astype('category').cat.codes
In [20]:
df
Out[20]:
age sex TSH T3 TT4 T4U FTI target
9 61 0 9.799999 1.2 114.0 0.84 136.0 3
10 27 1 90.000000 0.4 7.5 0.94 7.5 2
32 40 1 70.000000 0.4 3.9 0.83 5.0 2
40 57 0 0.250000 4.2 236.0 0.70 337.0 0
44 44 1 8.400000 1.8 108.0 1.01 107.0 3
... ... ... ... ... ... ... ... ...
2360 71 1 23.000000 1.8 87.0 0.96 91.0 3
2369 51 1 106.000000 0.6 5.0 0.89 5.5 2
2372 66 1 85.000000 1.8 118.0 1.27 93.0 3
2391 75 1 17.000000 1.4 104.0 1.15 90.0 3
2392 74 1 53.000000 1.0 49.0 1.25 39.0 2

484 rows × 8 columns

In [21]:
df.dtypes
Out[21]:
age         int64
sex         int64
TSH       float64
T3        float64
TT4       float64
T4U       float64
FTI       float64
target       int8
dtype: object
In [22]:
cols_to_drop = ['sex', 'target']
data_subset = df.drop(cols_to_drop, axis=1)

# Plot boxplots
plt.figure(figsize=(12, 8))
sns.boxplot(data=data_subset)
plt.title('Distribution of Numerical Features')
plt.show()
In [23]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
In [24]:
data_cleaned = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
print(data_cleaned.describe())
              age    sex         TSH          T3         TT4         T4U  \
count  240.000000  240.0  240.000000  240.000000  240.000000  240.000000   
mean    55.920833    1.0   19.507875    1.696667   77.457500    1.010792   
std     17.894565    0.0   16.719436    0.697507   34.421876    0.132440   
min      7.000000    1.0    0.020000    0.200000    2.000000    0.670000   
25%     42.750000    1.0    7.875000    1.375000   58.000000    0.920000   
50%     59.000000    1.0   12.000000    1.700000   80.000000    1.000000   
75%     71.000000    1.0   28.000000    2.100000   99.000000    1.090000   
max     88.000000    1.0   77.000000    4.300000  147.000000    1.360000   

              FTI      target  
count  240.000000  240.000000  
mean    77.015833    2.620833  
std     32.756725    0.565744  
min      2.000000    1.000000  
25%     60.000000    2.000000  
50%     80.000000    3.000000  
75%    100.000000    3.000000  
max    139.000000    3.000000  
In [25]:
df.hist(figsize=(12, 10), bins=20)
plt.suptitle('Histogram of Numerical Features')
plt.show()

# Plot pairplot to visualize the distribution and relationships between features
sns.pairplot(data_cleaned, diag_kind='kde', hue='target')
plt.suptitle('Pairplot of Features', y=1.02)
plt.show()

# Plot boxplots to visualize the distribution and identify outliers
cols_to_drop = ['sex', 'target']
data_subset = data_cleaned.drop(cols_to_drop, axis=1)

plt.figure(figsize=(12, 8))
sns.boxplot(data=data_subset)
plt.title('Boxplot of Numerical Features')
plt.show()
In [26]:
target_counts = df['target'].value_counts(normalize=True) * 100

# Print the results
print("Percentage of individuals based on the 'target' column (thyroid dataset):")
for target, percentage in target_counts.items():
    print(f"Target {target}: {percentage:.2f}%")
Percentage of individuals based on the 'target' column (thyroid dataset):
Target 3: 43.60%
Target 2: 33.88%
Target 0: 19.21%
Target 1: 3.31%
In [27]:
import seaborn as sns

# Define the list of numerical variables in your dataset
numerical_vars = ['age', 'TSH', 'T3', 'TT4', 'T4U', 'FTI']

plt.figure(figsize=(10, 8))
sns.heatmap(data_cleaned[numerical_vars].corr(), annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of Numerical Variables')
plt.show()
In [28]:
# Feature Engineering: Create a new feature representing the ratio of T3 to TT4
data_cleaned['T3_TT4_ratio'] = data_cleaned['T3'] / data_cleaned['TT4']
In [29]:
# Feature Selection: Calculate the correlation between features and the target variable
correlation = data_cleaned.corr()['target'].abs().sort_values(ascending=False)

# Select the top k most correlated features
k = 5  # Number of features to select
selected_features = correlation[1:k+1].index.tolist()  # Exclude the target column
In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load your cleaned data into a DataFrame named data_cleaned

# Feature Engineering
data_cleaned['T3_TT4_ratio'] = data_cleaned['T3'] / data_cleaned['TT4']

# Feature Selection
correlation = data_cleaned.corr()['target'].abs().sort_values(ascending=False)
k = 5  # Number of features to select
selected_features = correlation[1:k+1].index.tolist()  # Exclude the target column
In [31]:
#Split Dataset into Training and Testing Sets
from sklearn.model_selection import train_test_split

X = data_cleaned.drop('target', axis=1)
y = data_cleaned['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier()
}
In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Calculate metrics for each model
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')  # Specify average='weighted' for multiclass
    recall = recall_score(y_test, y_pred, average='weighted')  # Specify average='weighted' for multiclass
    f1 = f1_score(y_test, y_pred, average='weighted')  # Specify average='weighted' for multiclass
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Display results
import pandas as pd
results_df = pd.DataFrame(results).T
print(results_df)
                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.979167   0.980655  0.979167  0.979392
SVM                  0.958333   0.919048  0.958333  0.938113
Random Forest        1.000000   1.000000  1.000000  1.000000
In [34]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Ensure your data is correctly loaded and cleaned
# Example: data_cleaned = pd.read_csv('path_to_your_data.csv')

# Define the selected features and target variable, including the engineered feature
selected_features = ['age', 'sex', 'TSH', 'T3', 'TT4', 'T4U', 'FTI', 'T3_TT4_ratio']
X = data_cleaned[selected_features]
y = data_cleaned['target']

# Check for missing values
if X.isnull().any().any() or y.isnull().any():
    raise ValueError("Data contains missing values. Please handle them before proceeding.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the logistic regression model
logistic_model = LogisticRegression(C=1.0, solver='liblinear', random_state=42)
logistic_model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print("Model Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
Model Evaluation Metrics:
Accuracy: 0.9583
Precision: 0.9598
Recall: 0.9583
F1 Score: 0.9552
In [35]:
from sklearn.metrics import accuracy_score

# Assuming models is a dictionary containing model objects
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy:.2f}")
Logistic Regression Accuracy: 0.98
SVM Accuracy: 0.96
Random Forest Accuracy: 1.00
In [36]:
# Dictionary to store accuracies
accuracies = {}

# Iterate through each model
for name, model in models.items():
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Store accuracy in the dictionary
    accuracies[name] = accuracy

# Plotting
plt.figure(figsize=(10, 6))
plt.barh(list(accuracies.keys()), list(accuracies.values()), color='skyblue')
plt.xlabel('Accuracy')
plt.title('Accuracy of Different Models')
plt.xlim(0, 1)
plt.show()
In [37]:
import pickle

# Assuming you have a model named 'model' that you want to save
filename = 'thyroid.sav'
pickle.dump(logistic_model, open(filename, 'wb'))
In [38]:
df
Out[38]:
age sex TSH T3 TT4 T4U FTI target
9 61 0 9.799999 1.2 114.0 0.84 136.0 3
10 27 1 90.000000 0.4 7.5 0.94 7.5 2
32 40 1 70.000000 0.4 3.9 0.83 5.0 2
40 57 0 0.250000 4.2 236.0 0.70 337.0 0
44 44 1 8.400000 1.8 108.0 1.01 107.0 3
... ... ... ... ... ... ... ... ...
2360 71 1 23.000000 1.8 87.0 0.96 91.0 3
2369 51 1 106.000000 0.6 5.0 0.89 5.5 2
2372 66 1 85.000000 1.8 118.0 1.27 93.0 3
2391 75 1 17.000000 1.4 104.0 1.15 90.0 3
2392 74 1 53.000000 1.0 49.0 1.25 39.0 2

484 rows × 8 columns

In [ ]: